Plan na spotkanie

Pobierz plik df.rda i wykonaj na nim poniższe zadania https://github.com/pbiecek/LinearModels/blob/master/MIMUW_2017/Lab/df.rda

  1. Wykonaj analizę jednokierunkową wariancji ze zmienną V1. Ustaw poziom B jako poziom referencyjny.
# Load the prepared dataset (expects df.rda in the working directory);
# creates `df` with response y, factors V1-V3 and numeric predictors V4-V10.
load("df.rda")
# Quick overview of every column's distribution / factor levels.
summary(df)
##        y          V1      V2            V3            V4          
##  Min.   :-1.551   A:100   A:100   A      : 30   Min.   :0.004302  
##  1st Qu.: 4.277   B:100   B:100   B      : 30   1st Qu.:0.256051  
##  Median :13.517   C:100   C:100   C      : 30   Median :0.473608  
##  Mean   :13.842                   D      : 30   Mean   :0.487082  
##  3rd Qu.:20.949                   E      : 30   3rd Qu.:0.711717  
##  Max.   :42.344                   F      : 30   Max.   :0.996221  
##                                   (Other):120                     
##        V5                  V6                V7         
##  Min.   :-0.745221   Min.   :0.01059   Min.   :0.00199  
##  1st Qu.:-0.347411   1st Qu.:0.25362   1st Qu.:0.31153  
##  Median : 0.009391   Median :0.62682   Median :0.69831  
##  Mean   : 0.020876   Mean   :0.94575   Mean   :0.99212  
##  3rd Qu.: 0.417608   3rd Qu.:1.25495   3rd Qu.:1.42001  
##  Max.   : 0.749266   Max.   :6.81940   Max.   :5.50225  
##                                                         
##        V8                 V9                V10          
##  Min.   :0.000702   Min.   :0.008554   Min.   :0.003723  
##  1st Qu.:0.301992   1st Qu.:0.271763   1st Qu.:0.276671  
##  Median :0.730726   Median :0.684014   Median :0.691662  
##  Mean   :1.020246   Mean   :1.077737   Mean   :0.940086  
##  3rd Qu.:1.438641   3rd Qu.:1.468661   3rd Qu.:1.301117  
##  Max.   :6.677329   Max.   :8.122376   Max.   :7.444449  
## 
# Boxplot of y per V1 level to eyeball group differences before modelling.
ggplot(df, aes(V1,y)) + geom_boxplot()

# One-way ANOVA via lm(); default treatment contrasts use level A as reference.
summary(lm(formula = y~V1, data = df))
## 
## Call:
## lm(formula = y ~ V1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.258  -6.339  -1.739   6.110  18.245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.1211     0.7538  12.101   <2e-16 ***
## V1B          14.9776     1.0660  14.051   <2e-16 ***
## V1C          -0.8150     1.0660  -0.765    0.445    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 297 degrees of freedom
## Multiple R-squared:  0.4838, Adjusted R-squared:  0.4803 
## F-statistic: 139.2 on 2 and 297 DF,  p-value: < 2.2e-16

Przed przepoziomowaniem zmiennej V1 tylko poziom B różni się statystycznie istotnie od poziomu referencyjnego A.

# Task 1: make B the reference level of V1 (forcats) and refit the model;
# coefficients now measure A and C relative to B.
df <- df %>% mutate(V1 = fct_relevel(V1, "B"))
m <- lm(formula = y~V1, data = df)
summary(m)
## 
## Call:
## lm(formula = y ~ V1, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.258  -6.339  -1.739   6.110  18.245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  24.0987     0.7538   31.97   <2e-16 ***
## V1A         -14.9776     1.0660  -14.05   <2e-16 ***
## V1C         -15.7926     1.0660  -14.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 297 degrees of freedom
## Multiple R-squared:  0.4838, Adjusted R-squared:  0.4803 
## F-statistic: 139.2 on 2 and 297 DF,  p-value: < 2.2e-16
# Overall F-test for the V1 factor (invariant to the choice of reference level).
anova(m)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## V1          2  15813  7906.6  139.16 < 2.2e-16 ***
## Residuals 297  16874    56.8                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Zmienna V1 statystycznie istotnie objaśnia zmienną y.

  1. Połącz w zmiennych V1 i V2 poziomy B i C ze sobą, a następnie wykonaj test weryfikujący istotność interakcji.
# Same exploratory boxplot for V2 before merging its levels.
ggplot(df, aes(V2, y)) + geom_boxplot()

# Baseline one-way model for V2 with the original (unmerged) levels.
summary(lm(y~V2, data = df))
## 
## Call:
## lm(formula = y ~ V2, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.7799  -9.8480   0.1353   6.8259  28.7883 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.331      1.043  11.821   <2e-16 ***
## V2B            1.898      1.475   1.287   0.1991    
## V2C            2.635      1.475   1.786   0.0751 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.43 on 297 degrees of freedom
## Multiple R-squared:  0.01131,    Adjusted R-squared:  0.004654 
## F-statistic: 1.699 on 2 and 297 DF,  p-value: 0.1846

Pierwszy test, bez połączonych poziomów, nie wykazuje, że zmienna V2 istotnie objaśnia y.

# Task 2: merge levels B and C into one level "BC" in both V1 and V2,
# then test the V1:V2 interaction with a nested-model F-test.
unified <- df %>%
  mutate(V1 = fct_collapse(V1, BC = c("B", "C")),
         V2 = fct_collapse(V2, BC = c("B", "C")))
m_v1_v2_no_interaction <- lm(y ~ V1 + V2, data = unified)
m_v1_v2_interaction <- lm(y ~ V1 * V2, data = unified)
# anova(reduced, full) compares the models; the p-value tests the interaction.
anova(m_v1_v2_no_interaction, m_v1_v2_interaction)
## Analysis of Variance Table
## 
## Model 1: y ~ V1 + V2
## Model 2: y ~ V1 * V2
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    297 29134                           
## 2    296 28980  1    153.72 1.5701 0.2112
# Visual check: roughly parallel profiles indicate no interaction.
interaction.plot(unified$V2,unified$V1, unified$y)

Test F wskazuje na to, że interakcja między zmiennymi V1 i V2 nie objaśnia istotnie y.

  1. Dla zmiennej V1 porównaj wyniki dla różnych kontrastów, przynajmniej Helmerta, poly i sum.
# Task 3: the same model under three contrast codings. The fit (R^2, F,
# residuals) is identical; only the coefficient parameterization changes.
m_helmert <- lm(y~V1, data = df, contrasts = list(V1=contr.helmert(3)))
m_poly <- lm(y~V1, data = df, contrasts = list(V1=contr.poly(3)))
m_sum <- lm(y~V1, data = df, contrasts = list(V1=contr.sum(3)))
# Inspect the design matrix to see the Helmert coding.
head(model.matrix(m_helmert))
##   (Intercept) V11 V12
## 1           1   1  -1
## 2           1  -1  -1
## 3           1   0   2
## 4           1   1  -1
## 5           1  -1  -1
## 6           1   0   2
# Helmert contrasts: each coefficient compares a level with the mean of the previous ones.
summary(m_helmert)
## 
## Call:
## lm(formula = y ~ V1, data = df, contrasts = list(V1 = contr.helmert(3)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.258  -6.339  -1.739   6.110  18.245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.8419     0.4352  31.807   <2e-16 ***
## V11          -7.4888     0.5330 -14.051   <2e-16 ***
## V12          -2.7679     0.3077  -8.995   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 297 degrees of freedom
## Multiple R-squared:  0.4838, Adjusted R-squared:  0.4803 
## F-statistic: 139.2 on 2 and 297 DF,  p-value: < 2.2e-16
# Orthogonal polynomial coding: linear (V1.L) and quadratic (V1.Q) columns.
head(model.matrix(m_poly))
##   (Intercept)          V1.L       V1.Q
## 1           1 -7.850462e-17 -0.8164966
## 2           1 -7.071068e-01  0.4082483
## 3           1  7.071068e-01  0.4082483
## 4           1 -7.850462e-17 -0.8164966
## 5           1 -7.071068e-01  0.4082483
## 6           1  7.071068e-01  0.4082483
summary(m_poly)
## 
## Call:
## lm(formula = y ~ V1, data = df, contrasts = list(V1 = contr.poly(3)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.258  -6.339  -1.739   6.110  18.245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.8419     0.4352  31.807  < 2e-16 ***
## V1.L        -11.1670     0.7538 -14.815  < 2e-16 ***
## V1.Q          5.7819     0.7538   7.671 2.47e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 297 degrees of freedom
## Multiple R-squared:  0.4838, Adjusted R-squared:  0.4803 
## F-statistic: 139.2 on 2 and 297 DF,  p-value: < 2.2e-16
# Sum-to-zero coding: coefficients are deviations from the grand mean.
head(model.matrix(m_sum))
##   (Intercept) V11 V12
## 1           1   0   1
## 2           1   1   0
## 3           1  -1  -1
## 4           1   0   1
## 5           1   1   0
## 6           1  -1  -1
summary(m_sum)
## 
## Call:
## lm(formula = y ~ V1, data = df, contrasts = list(V1 = contr.sum(3)))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.258  -6.339  -1.739   6.110  18.245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13.8419     0.4352  31.807  < 2e-16 ***
## V11          10.2567     0.6154  16.666  < 2e-16 ***
## V12          -4.7209     0.6154  -7.671 2.47e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.538 on 297 degrees of freedom
## Multiple R-squared:  0.4838, Adjusted R-squared:  0.4803 
## F-statistic: 139.2 on 2 and 297 DF,  p-value: < 2.2e-16
  1. Wykonaj test post hoc dla zmiennej V3. Które poziomy różnią się pomiędzy sobą?

Na początek przeprowadźmy analizę wariancji dla zmiennej V3.

# Task 4: one-way ANOVA for the 10-level factor V3 as the basis for post-hoc tests.
m.V3 <- lm(y ~ V3, data = df)
summary(m.V3)
## 
## Call:
## lm(formula = y ~ V3, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.4375  -8.8835  -0.7477   6.8117  28.6437 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  15.9471     1.9132   8.335 3.14e-15 ***
## V3B          -3.3902     2.7057  -1.253   0.2112    
## V3C          -2.8994     2.7057  -1.072   0.2848    
## V3D          -2.2470     2.7057  -0.830   0.4069    
## V3E          -0.9555     2.7057  -0.353   0.7242    
## V3F          -2.2243     2.7057  -0.822   0.4117    
## V3G          -2.4525     2.7057  -0.906   0.3655    
## V3H           0.9105     2.7057   0.337   0.7367    
## V3I          -2.4153     2.7057  -0.893   0.3728    
## V3J          -5.3778     2.7057  -1.988   0.0478 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.48 on 290 degrees of freedom
## Multiple R-squared:  0.02578,    Adjusted R-squared:  -0.004453 
## F-statistic: 0.8527 on 9 and 290 DF,  p-value: 0.5682

Następnie test HSD Tukey'a.

# Tukey HSD intervals for all pairwise differences (family-wise error controlled).
plot(TukeyHSD(aov(m.V3)))

# agricolae's HSD.test prints compact letter groupings for the levels of V3.
HSD.test(aov(m.V3), "V3", console = TRUE)
## 
## Study: aov(m.V3) ~ "V3"
## 
## HSD Test for y 
## 
## Mean Square Error:  109.8092 
## 
## V3,  means
## 
##          y       std  r         Min      Max
## A 15.94708  9.909170 30 -0.49038520 37.67078
## B 12.55691  9.615747 30 -0.72225742 35.94348
## C 13.04770  9.677720 30 -1.38321875 30.79361
## D 13.70005 13.007236 30 -1.55076953 42.34373
## E 14.99162 10.883890 30 -0.09663986 42.14185
## F 13.72283  9.001764 30 -1.43271495 34.87147
## G 13.49456  9.870767 30 -0.45813374 41.11890
## H 16.85756  9.929838 30  0.76688651 38.71058
## I 13.53177 11.355871 30 -0.17902314 38.72374
## J 10.56928 10.959496 30 -0.88269015 32.22231
## 
## alpha: 0.05 ; Df Error: 290 
## Critical Value of Studentized Range: 4.509303 
## 
## Honestly Significant Difference: 8.627165 
## 
## Means with the same letter are not significantly different.
## 
## Groups, Treatments and means
## a     H   16.86 
## a     A   15.95 
## a     E   14.99 
## a     F   13.72 
## a     D   13.7 
## a     I   13.53 
## a     G   13.49 
## a     C   13.05 
## a     B   12.56 
## a     J   10.57

Test sugeruje, że nie ma istotnych różnic pomiędzy grupami.

# Fisher's LSD: no multiplicity correction, so it is the most liberal of the three.
LSD.test(aov(m.V3), "V3", console = TRUE)
## 
## Study: aov(m.V3) ~ "V3"
## 
## LSD t Test for y 
## 
## Mean Square Error:  109.8092 
## 
## V3,  means and individual ( 95 %) CI
## 
##          y       std  r       LCL      UCL         Min      Max
## A 15.94708  9.909170 30 12.181577 19.71258 -0.49038520 37.67078
## B 12.55691  9.615747 30  8.791402 16.32241 -0.72225742 35.94348
## C 13.04770  9.677720 30  9.282197 16.81320 -1.38321875 30.79361
## D 13.70005 13.007236 30  9.934545 17.46555 -1.55076953 42.34373
## E 14.99162 10.883890 30 11.226121 18.75713 -0.09663986 42.14185
## F 13.72283  9.001764 30  9.957326 17.48833 -1.43271495 34.87147
## G 13.49456  9.870767 30  9.729054 17.26006 -0.45813374 41.11890
## H 16.85756  9.929838 30 13.092057 20.62306  0.76688651 38.71058
## I 13.53177 11.355871 30  9.766269 17.29727 -0.17902314 38.72374
## J 10.56928 10.959496 30  6.803774 14.33478 -0.88269015 32.22231
## 
## alpha: 0.05 ; Df Error: 290
## Critical Value of t: 1.968178 
## 
## t-Student: 1.968178
## Alpha    : 0.05
## Least Significant Difference 5.325225
## Means with the same letter are not significantly different
## 
## 
## Groups, Treatments and means
## a     H   16.85756 
## a     A   15.94708 
## ab    E   14.99162 
## ab    F   13.72283 
## ab    D   13.70005 
## ab    I   13.53177 
## ab    G   13.49456 
## ab    C   13.0477 
## ab    B   12.55691 
## b     J   10.56928

Test LSD (najbardziej liberalny, bez korekty na wielokrotne porównania) sugeruje istnienie grup: poziomy H i A różnią się istotnie od poziomu J.

# Scheffe test: the most conservative of the three post-hoc procedures used here.
scheffe.test(aov(m.V3), "V3", console = TRUE)
## 
## Study: aov(m.V3) ~ "V3"
## 
## Scheffe Test for y 
## 
## Mean Square Error  : 109.8092 
## 
## V3,  means
## 
##          y       std  r         Min      Max
## A 15.94708  9.909170 30 -0.49038520 37.67078
## B 12.55691  9.615747 30 -0.72225742 35.94348
## C 13.04770  9.677720 30 -1.38321875 30.79361
## D 13.70005 13.007236 30 -1.55076953 42.34373
## E 14.99162 10.883890 30 -0.09663986 42.14185
## F 13.72283  9.001764 30 -1.43271495 34.87147
## G 13.49456  9.870767 30 -0.45813374 41.11890
## H 16.85756  9.929838 30  0.76688651 38.71058
## I 13.53177 11.355871 30 -0.17902314 38.72374
## J 10.56928 10.959496 30 -0.88269015 32.22231
## 
## alpha: 0.05 ; Df Error: 290 
## Critical Value of F: 1.912236 
## 
## Minimum Significant Difference: 11.22446 
## 
## Means with the same letter are not significantly different.
## 
## Groups, Treatments and means
## a     H   16.86 
## a     A   15.95 
## a     E   14.99 
## a     F   13.72 
## a     D   13.7 
## a     I   13.53 
## a     G   13.49 
## a     C   13.05 
## a     B   12.56 
## a     J   10.57

Test Scheffégo ponownie nie wykazuje istotnych różnic w średnich.

  1. Zweryfikuj istotność zależności od zmiennej V4
# Task 5: scatter plot of y vs the continuous V4 with a fitted regression line.
ggplot(df, aes(V4, y)) + geom_point() + geom_smooth(method = "lm")

# Simple linear regression on V4 and its F-test.
m.V4 <- lm(y~V4, data = df)
anova(m.V4)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value  Pr(>F)  
## V4          1    342  342.14  3.1522 0.07685 .
## Residuals 298  32345  108.54                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m.V4)
## 
## Call:
## lm(formula = y ~ V4, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.6935  -9.2155   0.5815   6.9727  27.5147 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   12.042      1.179  10.214   <2e-16 ***
## V4             3.696      2.082   1.775   0.0768 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.42 on 298 degrees of freedom
## Multiple R-squared:  0.01047,    Adjusted R-squared:  0.007146 
## F-statistic: 3.152 on 1 and 298 DF,  p-value: 0.07685

Wynik testu F (p = 0,077) wskazuje, że na poziomie istotności 0,05 nie ma istotnej zależności od zmiennej V4.

  1. Czy istotna jest interakcja pomiędzy V4 a V1? Jak pokazać tę zależność.
# Task 6: test the V1:V4 interaction by comparing additive vs interaction models.
m.V1_V4 <- lm(y ~ V1+V4, data = df)
m.V1_V4_inter <- lm(y~V1*V4, data = df)

# Nested-model F-test: p-value tests the two interaction terms jointly.
anova(m.V1_V4,m.V1_V4_inter)
## Analysis of Variance Table
## 
## Model 1: y ~ V1 + V4
## Model 2: y ~ V1 * V4
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    296 16791                           
## 2    294 16594  2    196.29 1.7388 0.1775

Wynik testu wskazuje na to, że interakcja zmiennych V1 i V4 nie jest statystycznie istotna.

# Visualisation: per-level regression lines; similar slopes suggest no interaction.
ggplot(df, aes(x = V4, y = y)) + geom_point() + geom_smooth(method = "lm") + facet_wrap(~V1, scales = "free_y")

7. Zweryfikuj zależność od zmiennej V5. A co jeżeli ta zależność nie jest liniowa? Sprawdź zależność od wielomianu stopnia 3.

# Task 7: dependence on V5; the scatter plot hints at a non-linear (U-shaped) relation.
ggplot(df, aes(V5, y)) + geom_point()

# Linear fit first - expected to miss a non-linear relationship.
m.V5 <- lm(y ~ V5, data = df)
anova(m.V5)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value Pr(>F)
## V5          1     37   37.18  0.3393 0.5607
## Residuals 298  32650  109.56
# Orthogonal polynomial fits of degree 2 and 3 in V5.
m.V5_poly2 <- lm(y ~ poly(V5,2), data = df)
m.V5_poly3 <- lm(y ~ poly(V5,3), data = df)
anova(m.V5_poly2)
## Analysis of Variance Table
## 
## Response: y
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## poly(V5, 2)   2  15800  7899.9  138.93 < 2.2e-16 ***
## Residuals   297  16888    56.9                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Does the cubic term add anything beyond the quadratic model?
anova(m.V5_poly2, m.V5_poly3)
## Analysis of Variance Table
## 
## Model 1: y ~ poly(V5, 2)
## Model 2: y ~ poly(V5, 3)
##   Res.Df   RSS Df Sum of Sq     F Pr(>F)
## 1    297 16888                          
## 2    296 16881  1    6.6165 0.116 0.7336
summary(m.V5_poly3)
## 
## Call:
## lm(formula = y ~ poly(V5, 3), data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.176 -5.598 -4.141  7.145 16.273 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    13.842      0.436  31.747   <2e-16 ***
## poly(V5, 3)1   -6.098      7.552  -0.807    0.420    
## poly(V5, 3)2  125.549      7.552  16.625   <2e-16 ***
## poly(V5, 3)3    2.572      7.552   0.341    0.734    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.552 on 296 degrees of freedom
## Multiple R-squared:  0.4836, Adjusted R-squared:  0.4783 
## F-statistic: 92.39 on 3 and 296 DF,  p-value: < 2.2e-16

Zarówno wykres zależności y od V5, jak i testy wskazują na zależność od wielomianu stopnia drugiego tej zmiennej.

  1. Zbuduj nową zmienną NV := V4 - 2*V5. Zbadaj związek z tą zmienną.
# Task 8: derived variable NV = V4 - 2*V5; examine its relation to y.
df_NV <- df %>%
        mutate(NV = V4 - 2*V5)
ggplot(df_NV, aes(NV, y)) + geom_point()

# Linear fit in NV and its F-test.
m.NV  <- lm(y~NV, data = df_NV)
anova(m.NV)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value Pr(>F)
## NV          1    129  128.89  1.1797 0.2783
## Residuals 298  32558  109.26
# Quadratic fit in NV (inherits the quadratic V5 effect found earlier).
m.NV2  <- lm(y~poly(NV,2), data = df_NV)
anova(m.NV2)
## Analysis of Variance Table
## 
## Response: y
##              Df Sum Sq Mean Sq F value    Pr(>F)    
## poly(NV, 2)   2  10305  5152.7  68.374 < 2.2e-16 ***
## Residuals   297  22382    75.4                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Po stworzeniu nowej zmiennej, będącej kombinacją liniową zmiennych V4 i V5, ponownie możemy stwierdzić istotną zależność od wielomianu stopnia drugiego zmiennej NV.

  1. Wybierz model optymalny według kryterium BIC - zrób przegląd pełny wszystkich modeli.
# Task 9: exhaustive best-subset search minimizing BIC.
# Candidate terms: every column except y, plus a quadratic term in V5.
# NOTE(review): the candidate set contains both "V5" and "poly(V5,2)", so some
# candidate models include V5 twice (collinear terms) - presumably harmless for
# the search, but verify this was intended.
# `bincombinations` comes from e1071, `glance` from broom, `map_df` from purrr.
vars <- matrix(c(colnames(df[,-1]), "poly(V5,2)"))
# All non-empty on/off combinations of the candidate terms (drop the empty row).
coefs <- (bincombinations(length(vars))==1)[-1,]

# Build a formula string "y~ a+b+..." from one logical inclusion row.
form_from_coefs <- function(coefs_row) {
  paste("y~", paste(vars[coefs_row], collapse="+"))
} 
forms <- apply(coefs, 1, form_from_coefs) 
# Fit every candidate model and collect one-row fit summaries (incl. BIC).
params <- forms %>%
          map_df(~ glance(lm(as.formula(.x), data = df)))
# Pick the formula with the smallest BIC and refit it.
best_formula <- as.formula(forms[which.min(t(params[,"BIC"]))])
best_model <- lm(best_formula, data = df)
print(best_formula)
## y ~ V1 + V4 + poly(V5, 2)
print(summary(best_model))
## 
## Call:
## lm(formula = best_formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5937 -1.1985  0.0163  1.1776  4.7132 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   22.1398     0.2530  87.515  < 2e-16 ***
## V1A          -15.3523     0.2490 -61.650  < 2e-16 ***
## V1C          -15.0520     0.2495 -60.336  < 2e-16 ***
## V4             3.7713     0.3530  10.685  < 2e-16 ***
## poly(V5, 2)1   4.8291     1.7611   2.742  0.00648 ** 
## poly(V5, 2)2 126.5130     1.7610  71.840  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.753 on 294 degrees of freedom
## Multiple R-squared:  0.9723, Adjusted R-squared:  0.9719 
## F-statistic:  2067 on 5 and 294 DF,  p-value: < 2.2e-16
  1. Wybierz model optymalny według kryterium AIC - użyj funkcji step.
# Task 10: backward stepwise selection by AIC (stepAIC from MASS),
# starting from the full model of all predictors plus poly(V5,2).
step.aic.backward <- stepAIC(lm(y~.+poly(V5,2),data=df),direction = "backward", trace = TRUE)
## Start:  AIC=354.44
## y ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + V10 + poly(V5, 
##     2)
## 
## 
## Step:  AIC=354.44
## y ~ V1 + V2 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V2           2       0.5   844.9  350.62
## - V3           9      46.0   890.3  352.34
## - V7           1       0.1   844.5  352.48
## - V6           1       0.4   844.8  352.59
## - V10          1       1.1   845.4  352.82
## - V9           1       4.5   848.8  354.03
## <none>                       844.4  354.44
## - V8           1       7.2   851.5  354.97
## - V4           1     342.1  1186.4  454.48
## - V1           2   14692.3 15536.6 1224.15
## - poly(V5, 2)  2   14912.9 15757.3 1228.38
## 
## Step:  AIC=350.62
## y ~ V1 + V3 + V4 + V6 + V7 + V8 + V9 + V10 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V3           9      46.2   891.0  348.58
## - V7           1       0.1   845.0  348.67
## - V6           1       0.6   845.4  348.82
## - V10          1       1.1   845.9  349.00
## - V9           1       4.5   849.4  350.21
## <none>                       844.9  350.62
## - V8           1       7.4   852.3  351.24
## - V4           1     342.8  1187.6  450.78
## - V1           2   14825.6 15670.5 1222.73
## - poly(V5, 2)  2   14976.7 15821.6 1225.60
## 
## Step:  AIC=348.58
## y ~ V1 + V4 + V6 + V7 + V8 + V9 + V10 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V6           1       0.0   891.0  346.58
## - V7           1       0.3   891.3  346.67
## - V10          1       0.5   891.5  346.74
## - V9           1       4.7   895.7  348.16
## <none>                       891.0  348.58
## - V8           1       7.9   898.9  349.21
## - V4           1     332.7  1223.7  441.76
## - V1           2   14851.2 15742.2 1206.09
## - poly(V5, 2)  2   15788.0 16679.1 1223.44
## 
## Step:  AIC=346.58
## y ~ V1 + V4 + V7 + V8 + V9 + V10 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V7           1       0.3   891.3  344.68
## - V10          1       0.5   891.5  344.75
## - V9           1       4.7   895.8  346.17
## <none>                       891.0  346.58
## - V8           1       7.8   898.9  347.21
## - V4           1     333.1  1224.2  439.87
## - V1           2   14938.0 15829.0 1205.75
## - poly(V5, 2)  2   15788.9 16680.0 1221.45
## 
## Step:  AIC=344.68
## y ~ V1 + V4 + V8 + V9 + V10 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V10          1       0.5   891.8  342.84
## - V9           1       4.8   896.1  344.29
## <none>                       891.3  344.68
## - V8           1       7.7   899.0  345.27
## - V4           1     337.6  1228.9  439.02
## - V1           2   14938.4 15829.8 1203.76
## - poly(V5, 2)  2   15871.1 16762.4 1220.93
## 
## Step:  AIC=342.84
## y ~ V1 + V4 + V8 + V9 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## - V9           1       4.7   896.5  342.41
## <none>                       891.8  342.84
## - V8           1       7.5   899.4  343.37
## - V4           1     337.1  1228.9  437.03
## - V1           2   15031.7 15923.6 1203.53
## - poly(V5, 2)  2   15887.3 16779.1 1219.23
## 
## Step:  AIC=342.41
## y ~ V1 + V4 + V8 + poly(V5, 2)
## 
##               Df Sum of Sq     RSS     AIC
## <none>                       896.5  342.41
## - V8           1       7.5   903.9  342.89
## - V4           1     347.0  1243.5  438.58
## - V1           2   15055.5 15952.0 1202.07
## - poly(V5, 2)  2   15883.2 16779.7 1217.24
# Final model chosen by the backward search.
step.aic.backward
## 
## Call:
## lm(formula = y ~ V1 + V4 + V8 + poly(V5, 2), data = df)
## 
## Coefficients:
##  (Intercept)           V1A           V1C            V4            V8  
##      22.3461      -15.3996      -15.0950        3.7523       -0.1636  
## poly(V5, 2)1  poly(V5, 2)2  
##       5.0935      126.4910
# Forward stepwise: start from the intercept-only model, search up to the
# full-model scope. Both directions end at y ~ V1 + V4 + V8 + poly(V5, 2).
min.model = lm(y ~ 1, data=df)
biggest <- formula(lm(y~. + poly(V5,2),df))
step.aic.forward <- stepAIC(min.model, direction = "forward", scope = biggest)
## Start:  AIC=1409.29
## y ~ 1
## 
##               Df Sum of Sq   RSS    AIC
## + V1           2   15813.2 16874 1214.9
## + poly(V5, 2)  2   15799.8 16888 1215.2
## + V4           1     342.1 32345 1408.1
## + V10          1     228.2 32459 1409.2
## <none>                     32687 1409.3
## + V7           1     161.0 32526 1409.8
## + V2           2     369.7 32318 1409.9
## + V8           1     138.0 32549 1410.0
## + V6           1     105.5 32582 1410.3
## + V9           1      47.4 32640 1410.8
## + V5           1      37.2 32650 1411.0
## + V3           9     842.7 31845 1419.5
## 
## Step:  AIC=1214.93
## y ~ V1
## 
##               Df Sum of Sq     RSS     AIC
## + poly(V5, 2)  2   15619.2  1254.9  439.32
## <none>                     16874.1 1214.93
## + V7           1      96.1 16778.1 1215.21
## + V4           1      83.5 16790.7 1215.44
## + V10          1      22.4 16851.8 1216.53
## + V5           1      15.7 16858.4 1216.65
## + V2           2     125.4 16748.7 1216.69
## + V8           1      13.3 16860.8 1216.69
## + V9           1       2.6 16871.6 1216.88
## + V6           1       0.2 16874.0 1216.92
## + V3           9     842.7 16031.4 1217.56
## 
## Step:  AIC=439.32
## y ~ V1 + poly(V5, 2)
## 
##        Df Sum of Sq     RSS    AIC
## + V4    1    351.00  903.94 342.89
## + V9    1     14.49 1240.45 437.83
## + V8    1     11.42 1243.52 438.58
## <none>              1254.93 439.32
## + V7    1      4.86 1250.08 440.15
## + V6    1      1.30 1253.64 441.01
## + V10   1      0.32 1254.62 441.24
## + V2    2      4.04 1250.90 442.35
## + V3    9     37.40 1217.53 448.24
## 
## Step:  AIC=342.89
## y ~ V1 + poly(V5, 2) + V4
## 
##        Df Sum of Sq    RSS    AIC
## + V8    1     7.455 896.48 342.41
## <none>              903.94 342.89
## + V9    1     4.582 899.35 343.37
## + V7    1     0.223 903.71 344.82
## + V10   1     0.208 903.73 344.82
## + V6    1     0.005 903.93 344.89
## + V3    9    46.156 857.78 345.17
## + V2    2     1.231 902.71 346.48
## 
## Step:  AIC=342.41
## y ~ V1 + poly(V5, 2) + V4 + V8
## 
##        Df Sum of Sq    RSS    AIC
## <none>              896.48 342.41
## + V9    1     4.670 891.81 342.84
## + V7    1     0.356 896.12 344.29
## + V10   1     0.349 896.13 344.29
## + V6    1     0.007 896.47 344.41
## + V3    9    45.568 850.91 344.76
## + V2    2     0.845 895.64 346.13
  1. Wykonaj diagnostykę reszt. Czy są obserwacje odstające/wpływowe?
# Task 11: residual diagnostics of the BIC-optimal model.
best_model <- lm(y ~ V1 + V4 + poly(V5, 2), data = df)
# Standard diagnostic panels incl. Cook's distance and leverage (which = 1:6).
plot(best_model, which = 1:6)

Wykresy diagnostyczne wskazują kilka obserwacji odstających — 53, 127, 140 — które mogą mieć znaczący wpływ na model.

# Breusch-Pagan test for heteroscedasticity (lmtest package).
bptest(best_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  best_model
## BP = 32.242, df = 5, p-value = 5.321e-06

Test Breuscha-Pagana (p ≈ 5,3e-06) wskazuje na odrzucenie hipotezy o jednorodności wariancji reszt — reszty są heteroskedastyczne.

# Durbin-Watson test for first-order autocorrelation of residuals.
dwtest(best_model)
## 
##  Durbin-Watson test
## 
## data:  best_model
## DW = 2.0732, p-value = 0.7531
## alternative hypothesis: true autocorrelation is greater than 0
# Breusch-Godfrey test for serial correlation up to order 1.
bgtest(best_model)
## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  best_model
## LM test = 0.41097, df = 1, p-value = 0.5215

Testy nie wykazują istnienia znaczącej korelacji rzędu 1 między resztami.

# Shapiro-Wilk test of residual normality.
shapiro.test(best_model$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  best_model$residuals
## W = 0.99701, p-value = 0.8548

Nie odrzucamy hipotezy o rozkładzie normalnym reszt.

# Rainbow test of the linearity assumption.
raintest(best_model)
## 
##  Rainbow test
## 
## data:  best_model
## Rain = 0.92273, df1 = 150, df2 = 144, p-value = 0.6871

Nie ma podstaw do odrzucenia hipotezy o liniowości.

# Ramsey RESET test for omitted non-linear terms.
resettest(best_model)
## 
##  RESET test
## 
## data:  best_model
## RESET = 0.056705, df1 = 2, df2 = 292, p-value = 0.9449
  1. Zweryfikuj istotność interakcji V6 i V7.
# Task 12: test the V6:V7 interaction via nested models.
model_v6_v7_no_interactions <- lm(y~V6+V7, data = df)
model_v6_v7_interactions <- lm(y~V6*V7, data = df)
anova(model_v6_v7_no_interactions, model_v6_v7_interactions)
## Analysis of Variance Table
## 
## Model 1: y ~ V6 + V7
## Model 2: y ~ V6 * V7
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    297 32405                           
## 2    296 32371  1    33.162 0.3032 0.5823
# Sequential ANOVA of the interaction model - neither main effects nor V6:V7 significant.
anova(model_v6_v7_interactions)
## Analysis of Variance Table
## 
## Response: y
##            Df Sum Sq Mean Sq F value Pr(>F)
## V6          1    105 105.494  0.9646 0.3268
## V7          1    177 177.389  1.6220 0.2038
## V6:V7       1     33  33.162  0.3032 0.5823
## Residuals 296  32371 109.363

Interakcja pomiędzy zmiennymi V6 i V7 nie ma istotnego wpływu na y.

  1. Porównaj wyniki z wynikami funkcji ctree pakiet partykit.
# Task 13: conditional inference tree (partykit) on the same formula as best_model.
model_ctree <- ctree(formula(best_model), data = df)
model_ctree
## 
## Model formula:
## y ~ V1 + V4 + poly(V5, 2)
## 
## Fitted party:
## [1] root
## |   [2] V1 in B: 24.099 (n = 100, err = 6269.2)
## |   [3] V1 in A, C: 8.714 (n = 200, err = 10638.2)
## 
## Number of inner nodes:    1
## Number of terminal nodes: 2
plot(model_ctree)

Funkcja ctree wskazuje, że największe znaczenie ma zmienna V1 (podział na grupę B oraz grupę A, C), co jest zgodne z naszymi wcześniejszymi obserwacjami.

  1. Użyj funkcji optim() aby znaleźć oceny współczynników z kryterium do optymalizacji abs(y - Xb)
# Task 14: estimate coefficients by minimizing the L1 loss |y - Xb| with optim().
y = df[,1]
# NOTE(review): `dummies` is created but never used below - dead code; the
# design matrix X is built directly by model.matrix().
dummies <- caret::dummyVars(~V1 + V2 + V3, data =df)
X = model.matrix(y~., data =df)
# Number of columns of the design matrix = number of coefficients to estimate.
k = dim(X)[2]
# Start from the zero vector.
beta_0 = matrix(rep(0, k))

# L1 (least absolute deviations) objective for a coefficient vector beta.
l1_loss <- function(beta) {
  sum(abs(y - X %*% beta))
}

# NOTE(review): the printed result shows $convergence == 1, i.e. the default
# Nelder-Mead hit its iteration limit before converging (21 parameters is a
# lot for Nelder-Mead). Consider method = "BFGS" or control = list(maxit = ...).
optim(beta_0, l1_loss)
## $par
##             [,1]
##  [1,]  1.6394320
##  [2,]  0.1001619
##  [3,] -1.3112407
##  [4,]  1.0906653
##  [5,]  1.3970504
##  [6,] -2.4964825
##  [7,]  2.2892887
##  [8,] -3.8944799
##  [9,]  2.5395681
## [10,]  0.2339755
## [11,] -1.2687781
## [12,] -0.5731605
## [13,] -5.7322079
## [14,] -2.4213077
## [15,]  1.8107060
## [16,] -1.6191950
## [17,]  1.0604801
## [18,]  1.6834477
## [19,]  3.1678633
## [20,]  2.0482337
## [21,]  1.3704379
## 
## $value
## [1] 2704.123
## 
## $counts
## function gradient 
##      502       NA 
## 
## $convergence
## [1] 1
## 
## $message
## NULL
  1. Funkcja rlm z pakietu MASS wykonuje regresję odporną. Sprawdź jak wpłynie ona na ocenę współczynników.
# Task 15: robust regression (MASS::rlm, M-estimation) on the best model's formula.
robust_model <-rlm(formula(best_model), data = df)
summary(robust_model)
## 
## Call: rlm(formula = formula(best_model), data = df)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -4.62538 -1.09093  0.01938  1.17411  4.93880 
## 
## Coefficients:
##              Value    Std. Error t value 
## (Intercept)   22.2818   0.2641    84.3729
## V1A          -15.3083   0.2600   -58.8885
## V1C          -15.0175   0.2604   -57.6671
## V4             3.4129   0.3685     9.2624
## poly(V5, 2)1   5.3450   1.8384     2.9075
## poly(V5, 2)2 126.7844   1.8383    68.9671
## 
## Residual standard error: 1.718 on 294 degrees of freedom
# NOTE(review): anova() on an rlm object reports sums of squares but no F/p
# values (visible in the output below) - its inferential use is limited.
anova(robust_model)
## Analysis of Variance Table
## 
## Response: y
##             Df  Sum Sq Mean Sq F value Pr(>F)
## V1           2 14484.7  7242.4               
## V4           1    43.2    43.2               
## poly(V5, 2)  2 15209.7  7604.9               
## Residuals        907.9
# Tidy coefficient tables for a side-by-side comparison (broom).
tidy(best_model)
##           term   estimate std.error statistic       p.value
## 1  (Intercept)  22.139752 0.2529835  87.51460 1.399858e-212
## 2          V1A -15.352348 0.2490239 -61.65011 3.414359e-170
## 3          V1C -15.051955 0.2494683 -60.33614 1.211301e-167
## 4           V4   3.771335 0.3529703  10.68457  9.893235e-23
## 5 poly(V5, 2)1   4.829064 1.7610565   2.74214  6.478323e-03
## 6 poly(V5, 2)2 126.513003 1.7610392  71.83997 1.651162e-188
tidy(robust_model)
##           term   estimate std.error  statistic
## 1  (Intercept)  22.281795 0.2640871  84.372891
## 2          V1A -15.308282 0.2599537 -58.888500
## 3          V1C -15.017542 0.2604176 -57.667148
## 4           V4   3.412858 0.3684624   9.262431
## 5 poly(V5, 2)1   5.344980 1.8383504   2.907487
## 6 poly(V5, 2)2 126.784438 1.8383323  68.967094
plot(robust_model, which= 1:6)

Współczynniki dla modelu odpornego są zbliżone do współczynników modelu liniowego. Większą różnicę, rzędu 20% wartości współczynnika, można zaobserwować dla zmiennej V5 (składnik liniowy).